Discrete Distribution Sampler

Metrics and Estimates

Statisticians often use the term estimate for a value calculated from the data at hand, to draw a distinction between what we see from the data and the theoretical true or exact state of affairs. Data scientists and business analysts are more likely to refer to such a value as a metric. The difference reflects the approach of statistics versus that of data science: accounting for uncertainty lies at the heart of the discipline of statistics, whereas concrete business or organizational objectives are the focus of data science. Hence, statisticians estimate, and data scientists measure.

Peter Bruce, Andrew Bruce, Peter Gedeck - Practical Statistics for Data Scientists_ 50+ Essential Concepts Using R and Python-O’Reilly Media (2020)

1. Discrete Distribution Sampler

The Discrete Sampler class was designed to randomly pick samples from a list of possible values. For Monte Carlo methods, sampling from a pre-existing value distribution is needed. The Discrete_Distribution_Sampler samples values from a list of values using one of the following approaches:

UniformSample: Select a random value from the distribution that is uniformly sampled from the minimum to maximum sample of distribution:
sample = random.uniform(min(distribution), max(distribution))
BinSample: Simply pick a random value from the distribution (ie. from an ordered list). Also known in the literature as bootstrapping
sample = distribution[random.uniform(num_elements)]
Jittered BinSample: Pick a random value from the distribution (BinSample) and then add a proportionally small amount of random noise.
sample = BinSampleDistribution + jittered_noise

Code

import numpy as np
import pandas as pd
import math as math
import statistics as stats
import random
from collections import Counter
import khipu_kamayuq as kamayuq  # A Khipu Maker is known (in Quechua) as a Khipu Kamayuq
import khipu_qollqa as kq
import khipu_utils as ku
from pandas import Series, DataFrame

# Plotly
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
plotly.offline.init_notebook_mode(connected = False)

Code

class Discrete_Sampler():
    def __init__(self, values):
        values.sort()
        self.values = values
        self.log_values = [math.log(x) for x in self.values if x > 0]
        self.min_value = min(values)
        self.max_value = max(values)
        self.mean_value = stats.mean(values)
        self.stddev = stats.stdev(values)
        counter = Counter(values)
        self.max_occurences = counter.most_common(1)[0][1]
        self.frequency_counter = sorted(counter.most_common(), key=lambda x: x[0])
        self.np_frequency_values = np.array([y_val for (y_val, num_occurences) in self.frequency_counter])

    def __repr__(self):
        the_rep = f"Discrete_Sampler({len(self.values)} samples=)"
        the_rep += f"\n\t {self.min_value=} {self.max_value=} {self.mean_value=} {self.stddev=}"
        the_rep += f"\n\t {self.frequency_counter[0:5]=}"
        the_rep += f"\n\t {self.max_occurences=}"
        the_rep += "\n"
        return the_rep
        
    def plot_violin_frequency(self, title=None, use_log_value=True):
        df = pd.DataFrame(zip(self.values, self.log_values), columns=['value', 'log_value'])
        legend_text = title if title else "<b>Sample Values</b>"
        legend_text += " <span style=\"font-size:.8em;\">Hover over points to see values</span>"
        fig = (px.violin(df, 
                 y="log_value" if use_log_value else "value",  
                 points='all',
                 labels={"log_value": "Log<sub>e</sub>(Value)", "value": "Value"},
                 hover_data=['value', 'log_value'], 
                 title=legend_text,
                 width=944, height=944).show())

    def uniform_sample(self, num_samples=1):
        return [random.uniform(self.min_value, self.max_value) for _ in range(num_samples)]

    def bin_sample(self, num_samples=1):
        return [random.choice(self.values) for _ in range(num_samples)]
    
    def jittered_bin_sample(self, num_samples=1):
        def jitter_sample(x): 
            sample = x + (random.uniform(-0.5, 0.5))*.1*x
            return ku.clip(sample, self.min_value, self.max_value)
        samples = self.bin_sample(num_samples=num_samples)
        return [jitter_sample(x) for x in samples]
    
    def continuous_sample(self, num_samples=1):
        def one_sample(x, y):
            # Test to see if the point x,y (num_occurences, value) is in the cumulative density distribution
            # y_index = sum([(y_val < y) for (y_val, num_occurences) in self.frequency_counter])
            y_index = len(self.np_frequency_values[self.np_frequency_values < y]) # Faster than the above
            if y_index > len(self.frequency_counter)-1: return None
            next_y_index = y_index+1 if y_index < len(self.frequency_counter)-1 else y_index

            (y1,y2) = (self.np_frequency_values[y_index], self.np_frequency_values[next_y_index])
            lerp_ratio = 0 if y2-y1==0 else float(y-y1)/float(y2-y1)
            (x1, x2) = (self.frequency_counter[y_index][1], self.frequency_counter[next_y_index][1])
            interpolated_num_occurences = round(ku.linear_interpolate(x1,x2, lerp_ratio))

            # In the distribution???
            in_CDF = ((x <= interpolated_num_occurences) and
                      (y <= self.max_value) and 
                      (y_index < len(self.frequency_counter)-2))
          
            # If the point is in the distribution, then return the interpolated value
            return y if in_CDF else None

        # Now start generating random sample points, and add them if they are in the distribution
        samples = []
        while len(samples) < num_samples:
            y = random.randint(self.min_value, self.max_value)
            # Trim x to be left triangular part of "rect of [max occurences X max value]"
            # This reduces number of false sample points by half, speeding up algorithm considerably
            lerp_ratio = (y-self.min_value)/(self.max_value-self.min_value)
            x = random.randint(1, round(ku.linear_interpolate(1,self.max_occurences, lerp_ratio)))
            sample = one_sample(x, y)
            if sample: 
                samples.append(round(sample))

 
        return samples

2. Building a sample distribution

The database of KFG cord values will be used as the sample distribution. Note that 0 values are ignored. This is reasonable since it makes no difference to summers for 0 cords (they are ignored). It also allows us to see log values of the distribution more easily.

Code

# Use NON_ZERO cords in the khipu database as sample values
(khipu_dict, all_khipus) = kamayuq.fetch_khipus()
sample_values = []
for aKhipu in all_khipus:
    sample_values += [aCord.knotted_value() for aCord in aKhipu.pendant_cords() if aCord.knotted_value() > 0]


the_discrete_sampler = Discrete_Sampler(sample_values)
print(f"{the_discrete_sampler=}")
the_discrete_sampler.plot_violin_frequency(title="Discrete Samples", use_log_value=False)
the_discrete_sampler.plot_violin_frequency(title="Discrete Samples - Log<sub>e</sub>(Value)", use_log_value=True)

the_discrete_sampler=Discrete_Sampler(28465 samples=)
     self.min_value=1 self.max_value=320535 self.mean_value=286 self.stddev=3139.2709344687023
     self.frequency_counter[0:5]=[(1, 3045), (2, 2144), (3, 1404), (4, 1366), (5, 998)]
     self.max_occurences=3045

3. Uniform Samples

Let’s test uniform samples:

Code

uniform_samples = the_discrete_sampler.uniform_sample(1000)
uniform_sampler = Discrete_Sampler(uniform_samples)
print(f"{uniform_sampler=}")
uniform_sampler.plot_violin_frequency(title="Uniform Samples", use_log_value=False)
uniform_sampler.plot_violin_frequency(title="Uniform Samples - Log<sub>e</sub>(Value)", use_log_value=True)

uniform_sampler=Discrete_Sampler(1000 samples=)
     self.min_value=143.5002344522874 self.max_value=320400.3074273993 self.mean_value=163023.33221971305 self.stddev=91847.21271275752
     self.frequency_counter[0:5]=[(143.5002344522874, 1), (466.432204835245, 1), (1182.9455794369978, 1), (1204.0978116944552, 1), (1275.6019331635378, 1)]
     self.max_occurences=1

4. Bin Samples

Code

bin_samples = the_discrete_sampler.bin_sample(1000)
bin_sampler = Discrete_Sampler(bin_samples)
print(f"{bin_sampler=}")

bin_sampler.plot_violin_frequency(title="Bin Samples - Log<sub>e</sub>(Value)", use_log_value=True)

bin_sampler=Discrete_Sampler(1000 samples=)
     self.min_value=1 self.max_value=32587 self.mean_value=280 self.stddev=1789.7209279661452
     self.frequency_counter[0:5]=[(1, 115), (2, 69), (3, 48), (4, 50), (5, 38)]
     self.max_occurences=115

5. Jittered Bin Samples

Code

jittered_bin_samples = the_discrete_sampler.jittered_bin_sample(1000)
jittered_bin_sampler = Discrete_Sampler(jittered_bin_samples)
print(f"{jittered_bin_sampler=}")

jittered_bin_sampler.plot_violin_frequency(title="Jittered Bin Samples - Log<sub>e</sub>(Value)", use_log_value=True)

jittered_bin_sampler=Discrete_Sampler(1000 samples=)
     self.min_value=1.0 self.max_value=34185.68116257077 self.mean_value=211.44036686528116 self.stddev=1317.7529306684214
     self.frequency_counter[0:5]=[(1.0, 52), (1.0002270060089844, 1), (1.0012748442938642, 1), (1.0016610334961105, 1), (1.0023893107435031, 1)]
     self.max_occurences=52